AD688 Lab 06: Visual Reporting and Storytelling

AD688 Lab 06: Visual Reporting and Storytelling

Author: Evelyn Zhou
Date: 2025-03-24

This notebook demonstrates how to load, transform, and visualize the Lightcast job postings dataset using Spark and Plotly.

import os
import plotly.io as pio
import plotly.graph_objects as go
import plotly.express as px
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used throughout this notebook.
spark = SparkSession.builder.appName("Lightcast").getOrCreate()

# Free-text fields such as BODY contain embedded newlines and double quotes.
# With Spark's default line-per-record CSV parsing those records are split
# mid-field, which is presumably why inferSchema reported every column as a
# string. Parse the file as multi-line CSV with '"' as both the quote and the
# escape character so each posting stays in one row.
df = (
    spark.read
    .option("multiLine", True)
    .option("escape", "\"")
    .option("quote", "\"")
    .csv("lightcast_job_postings.csv", header=True, inferSchema=True)
)

# Print the inferred column names and types.
df.printSchema()
[Stage 29:================================================>         (5 + 1) / 6]
                                                                                
root
 |-- ID: string (nullable = true)
 |-- LAST_UPDATED_DATE: string (nullable = true)
 |-- LAST_UPDATED_TIMESTAMP: string (nullable = true)
 |-- DUPLICATES: string (nullable = true)
 |-- POSTED: string (nullable = true)
 |-- EXPIRED: string (nullable = true)
 |-- DURATION: string (nullable = true)
 |-- SOURCE_TYPES: string (nullable = true)
 |-- SOURCES: string (nullable = true)
 |-- URL: string (nullable = true)
 |-- ACTIVE_URLS: string (nullable = true)
 |-- ACTIVE_SOURCES_INFO: string (nullable = true)
 |-- TITLE_RAW: string (nullable = true)
 |-- BODY: string (nullable = true)
 |-- MODELED_EXPIRED: string (nullable = true)
 |-- MODELED_DURATION: string (nullable = true)
 |-- COMPANY: string (nullable = true)
 |-- COMPANY_NAME: string (nullable = true)
 |-- COMPANY_RAW: string (nullable = true)
 |-- COMPANY_IS_STAFFING: string (nullable = true)
 |-- EDUCATION_LEVELS: string (nullable = true)
 |-- EDUCATION_LEVELS_NAME: string (nullable = true)
 |-- MIN_EDULEVELS: string (nullable = true)
 |-- MIN_EDULEVELS_NAME: string (nullable = true)
 |-- MAX_EDULEVELS: string (nullable = true)
 |-- MAX_EDULEVELS_NAME: string (nullable = true)
 |-- EMPLOYMENT_TYPE: string (nullable = true)
 |-- EMPLOYMENT_TYPE_NAME: string (nullable = true)
 |-- MIN_YEARS_EXPERIENCE: string (nullable = true)
 |-- MAX_YEARS_EXPERIENCE: string (nullable = true)
 |-- IS_INTERNSHIP: string (nullable = true)
 |-- SALARY: string (nullable = true)
 |-- REMOTE_TYPE: string (nullable = true)
 |-- REMOTE_TYPE_NAME: string (nullable = true)
 |-- ORIGINAL_PAY_PERIOD: string (nullable = true)
 |-- SALARY_TO: string (nullable = true)
 |-- SALARY_FROM: string (nullable = true)
 |-- LOCATION: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- CITY_NAME: string (nullable = true)
 |-- COUNTY: string (nullable = true)
 |-- COUNTY_NAME: string (nullable = true)
 |-- MSA: string (nullable = true)
 |-- MSA_NAME: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- STATE_NAME: string (nullable = true)
 |-- COUNTY_OUTGOING: string (nullable = true)
 |-- COUNTY_NAME_OUTGOING: string (nullable = true)
 |-- COUNTY_INCOMING: string (nullable = true)
 |-- COUNTY_NAME_INCOMING: string (nullable = true)
 |-- MSA_OUTGOING: string (nullable = true)
 |-- MSA_NAME_OUTGOING: string (nullable = true)
 |-- MSA_INCOMING: string (nullable = true)
 |-- MSA_NAME_INCOMING: string (nullable = true)
 |-- NAICS2: string (nullable = true)
 |-- NAICS2_NAME: string (nullable = true)
 |-- NAICS3: string (nullable = true)
 |-- NAICS3_NAME: string (nullable = true)
 |-- NAICS4: string (nullable = true)
 |-- NAICS4_NAME: string (nullable = true)
 |-- NAICS5: string (nullable = true)
 |-- NAICS5_NAME: string (nullable = true)
 |-- NAICS6: string (nullable = true)
 |-- NAICS6_NAME: string (nullable = true)
 |-- TITLE: string (nullable = true)
 |-- TITLE_NAME: string (nullable = true)
 |-- TITLE_CLEAN: string (nullable = true)
 |-- SKILLS: string (nullable = true)
 |-- SKILLS_NAME: string (nullable = true)
 |-- SPECIALIZED_SKILLS: string (nullable = true)
 |-- SPECIALIZED_SKILLS_NAME: string (nullable = true)
 |-- CERTIFICATIONS: string (nullable = true)
 |-- CERTIFICATIONS_NAME: string (nullable = true)
 |-- COMMON_SKILLS: string (nullable = true)
 |-- COMMON_SKILLS_NAME: string (nullable = true)
 |-- SOFTWARE_SKILLS: string (nullable = true)
 |-- SOFTWARE_SKILLS_NAME: string (nullable = true)
 |-- ONET: string (nullable = true)
 |-- ONET_NAME: string (nullable = true)
 |-- ONET_2019: string (nullable = true)
 |-- ONET_2019_NAME: string (nullable = true)
 |-- CIP6: string (nullable = true)
 |-- CIP6_NAME: string (nullable = true)
 |-- CIP4: string (nullable = true)
 |-- CIP4_NAME: string (nullable = true)
 |-- CIP2: string (nullable = true)
 |-- CIP2_NAME: string (nullable = true)
 |-- SOC_2021_2: string (nullable = true)
 |-- SOC_2021_2_NAME: string (nullable = true)
 |-- SOC_2021_3: string (nullable = true)
 |-- SOC_2021_3_NAME: string (nullable = true)
 |-- SOC_2021_4: string (nullable = true)
 |-- SOC_2021_4_NAME: string (nullable = true)
 |-- SOC_2021_5: string (nullable = true)
 |-- SOC_2021_5_NAME: string (nullable = true)
 |-- LOT_CAREER_AREA: string (nullable = true)
 |-- LOT_CAREER_AREA_NAME: string (nullable = true)
 |-- LOT_OCCUPATION: string (nullable = true)
 |-- LOT_OCCUPATION_NAME: string (nullable = true)
 |-- LOT_SPECIALIZED_OCCUPATION: string (nullable = true)
 |-- LOT_SPECIALIZED_OCCUPATION_NAME: string (nullable = true)
 |-- LOT_OCCUPATION_GROUP: string (nullable = true)
 |-- LOT_OCCUPATION_GROUP_NAME: string (nullable = true)
 |-- LOT_V6_SPECIALIZED_OCCUPATION: string (nullable = true)
 |-- LOT_V6_SPECIALIZED_OCCUPATION_NAME: string (nullable = true)
 |-- LOT_V6_OCCUPATION: string (nullable = true)
 |-- LOT_V6_OCCUPATION_NAME: string (nullable = true)
 |-- LOT_V6_OCCUPATION_GROUP: string (nullable = true)
 |-- LOT_V6_OCCUPATION_GROUP_NAME: string (nullable = true)
 |-- LOT_V6_CAREER_AREA: string (nullable = true)
 |-- LOT_V6_CAREER_AREA_NAME: string (nullable = true)
 |-- SOC_2: string (nullable = true)
 |-- SOC_2_NAME: string (nullable = true)
 |-- SOC_3: string (nullable = true)
 |-- SOC_3_NAME: string (nullable = true)
 |-- SOC_4: string (nullable = true)
 |-- SOC_4_NAME: string (nullable = true)
 |-- SOC_5: string (nullable = true)
 |-- SOC_5_NAME: string (nullable = true)
 |-- LIGHTCAST_SECTORS: string (nullable = true)
 |-- LIGHTCAST_SECTORS_NAME: string (nullable = true)
 |-- NAICS_2022_2: string (nullable = true)
 |-- NAICS_2022_2_NAME: string (nullable = true)
 |-- NAICS_2022_3: string (nullable = true)
 |-- NAICS_2022_3_NAME: string (nullable = true)
 |-- NAICS_2022_4: string (nullable = true)
 |-- NAICS_2022_4_NAME: string (nullable = true)
 |-- NAICS_2022_5: string (nullable = true)
 |-- NAICS_2022_5_NAME: string (nullable = true)
 |-- NAICS_2022_6: string (nullable = true)
 |-- NAICS_2022_6_NAME: string (nullable = true)
# Load the postings CSV with multi-line, quote-aware parsing: a quoted field
# may span several lines, and '"' serves as both quote and escape character.
# The first row is the header, and column types are inferred from the data.
df = (
    spark.read
    .options(
        multiLine=True,
        escape='"',
        quote='"',
        header=True,
        inferSchema=True,
    )
    .csv("lightcast_job_postings.csv")
)
                                                                                
# Count every row in the DataFrame (this triggers a full Spark job).
display_count = df.count()
print(f"Total number of records: {display_count}")

# Preview the first five rows; Spark truncates long cell values by default.
df.show(n=5)
                                                                                
Total number of records: 72498
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+---------------
-----+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
|                  ID|LAST_UPDATED_DATE|LAST_UPDATED_TIMESTAMP|DUPLICATES|  POSTED|  EXPIRED|DURATION|        SOURCE_TYPES|             SOURCES|                 URL|ACTIVE_URLS|ACTIVE_SOURCES_INFO|           TITLE_RAW|                BODY|MODELED_EXPIRED|MODELED_DURATION| COMPANY|        COMPANY_NAME|COMPANY_RAW|COMPANY_IS_STAFFING|EDUCATION_LEVELS|EDUCATION_LEVELS_NAME|MIN_EDULEVELS| MIN_EDULEVELS_NAME|MAX_EDULEVELS|MAX_EDULEVELS_NAME|EMPLOYMENT_TYPE|EMPLOYMENT_TYPE_NAME|MIN_YEARS_EXPERIENCE|MAX_YEARS_EXPERIENCE|IS_INTERNSHIP|SALARY|REMOTE_TYPE|REMOTE_TYPE_NAME|ORIGINAL_PAY_PERIOD|SALARY_TO|SALARY_FROM|            LOCATION|                CITY|    CITY_NAME|COUNTY|   COUNTY_NAME|  MSA|            MSA_NAME|STATE|STATE_NAME|COUNTY_OUTGOING|COUNTY_NAME_OUTGOING|COUNTY_INCOMING|COUNTY_NAME_INCOMING|MSA_OUTGOING|   MSA_NAME_OUTGOING|MSA_INCOMING|   MSA_NAME_INCOMING|NAICS2|         NAICS2_NAME|NAICS3|         NAICS3_NAME|NAICS4|         NAICS4_NAME|NAICS5|         NAICS5_NAME|NAICS6|         NAICS6_NAME|             TITLE|         TITLE_NAME|         TITLE_CLEAN|              SKILLS|         SKILLS_NAME|  SPECIALIZED_SKILLS|SPECIALIZED_SKILLS_NAME|      CERTIFICATIONS| CERTIFICATIONS_NAME|       COMMON_SKILLS|  COMMON_SKILLS_NAME|     SOFTWARE_SKILLS|SOFTWARE_SKILLS_NAME|      ONET|           ONET_NAME| ONET_2019|      ONET_2019_NAME|                CIP6|           CIP6_NAME|                CIP4|           CIP4_NAME|                CIP2|           CIP2_NAME|SOC_2021_2|     SOC_2021_2_NAME|SOC_2021_3|     SOC_2021_3_NAME|SOC_2021_4|SOC_2021_4_NAME|SOC_2021_5|SOC_2021_5_NAME|LOT_CAREER_AREA|LOT_CAREER_AREA_NAME|LOT_OCCUPATION| LOT_OCCUPATION_NAME|LOT_SPECIALIZED_OCCUPATION|LOT_SPECIALIZED_OCCUPATION_NAME|LOT_OCCUPATION_GROUP|LOT_OCCUPATION_GROUP_NAME|LOT_V6_SPECIALIZED_OCCUPATION|LOT_V6_SPECIALIZED_OCCUPATION_NAME|LOT_V6_OCCUPATION|LOT_V6_OCCUPATION_NAME|LOT_V6_OCCUPATION_GROUP|LOT_V6_OCCUPATION_GROUP_NAME|LOT_V6_CAREER_AREA|LOT_V6_CAREER_AREA_NAME|  SOC_2|          
SOC_2_NAME|  SOC_3|          SOC_3_NAME|  SOC_4|     SOC_4_NAME|  SOC_5|     SOC_5_NAME|LIGHTCAST_SECTORS|LIGHTCAST_SECTORS_NAME|NAICS_2022_2|   NAICS_2022_2_NAME|NAICS_2022_3|   NAICS_2022_3_NAME|NAICS_2022_4|   NAICS_2022_4_NAME|NAICS_2022_5|   NAICS_2022_5_NAME|NAICS_2022_6|   NAICS_2022_6_NAME|
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+---------------
-----+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
|1f57d95acf4dc67ed...|         9/6/2024|  2024-09-06 20:32:...|         0|6/2/2024| 6/8/2024|       6|   [\n  "Company"\n]|[\n  "brassring.c...|[\n  "https://sjo...|         []|               NULL|Enterprise Analys...|31-May-2024\n\nEn...|       6/8/2024|               6|  894731|          Murphy USA| Murphy USA|              false|       [\n  2\n]| [\n  "Bachelor's ...|            2|  Bachelor's degree|         NULL|              NULL|              1|Full-time (> 32 h...|                   2|                   2|        false|  NULL|          0|          [None]|               NULL|     NULL|       NULL|{\n  "lat": 33.20...|RWwgRG9yYWRvLCBBUg==|El Dorado, AR|  5139|     Union, AR|20980|       El Dorado, AR|    5|  Arkansas|           5139|           Union, AR|           5139|           Union, AR|       20980|       El Dorado, AR|       20980|       El Dorado, AR|    44|        Retail Trade|   441|Motor Vehicle and...|  4413|Automotive Parts,...| 44133|Automotive Parts ...|441330|Automotive Parts ...|ET29C073C03D1F86B4|Enterprise Analysts|enterprise analys...|[\n  "KS126DB6T06...|[\n  "Merchandisi...|[\n  "KS126DB6T06...|   [\n  "Merchandisi...|                  []|                  []|[\n  "KS126706DPF...|[\n  "Mathematics...|[\n  "KS440W865GC...|[\n  "SQL (Progra...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|[\n  "45.0601",\n...|[\n  "Economics, ...|[\n  "45.06",\n  ...|[\n  "Economics",...|[\n  "45",\n  "27...|[\n  "Social Scie...|   15-0000|Computer and Math...|   15-2000|Mathematical Scie...|   15-2050|Data Scientists|   15-2051|Data Scientists|             23|Information Techn...|        231010|Business Intellig...|                  23101011|           General ERP Analy...|                2310|     Business Intellig...|                     23101011|              General ERP Analy...|           231010|  Business Intellig...|                   2310|        Business Intellig...|                23|   Information Techn...|15-0000|Computer and 
Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists|        [\n  7\n]|  [\n  "Artificial ...|          44|        Retail Trade|         441|Motor Vehicle and...|        4413|Automotive Parts,...|       44133|Automotive Parts ...|      441330|Automotive Parts ...|
|0cb072af26757b6c4...|         8/2/2024|  2024-08-02 17:08:...|         0|6/2/2024| 8/1/2024|    NULL| [\n  "Job Board"\n]| [\n  "maine.gov"\n]|[\n  "https://job...|         []|               NULL|Oracle Consultant...|Oracle Consultant...|       8/1/2024|            NULL|  133098|Smx Corporation L...|        SMX|               true|      [\n  99\n]| [\n  "No Educatio...|           99|No Education Listed|         NULL|              NULL|              1|Full-time (> 32 h...|                   3|                   3|        false|  NULL|          1|          Remote|               NULL|     NULL|       NULL|{\n  "lat": 44.31...|    QXVndXN0YSwgTUU=|  Augusta, ME| 23011|  Kennebec, ME|12300|Augusta-Watervill...|   23|     Maine|          23011|        Kennebec, ME|          23011|        Kennebec, ME|       12300|Augusta-Watervill...|       12300|Augusta-Watervill...|    56|Administrative an...|   561|Administrative an...|  5613| Employment Services| 56132|Temporary Help Se...|561320|Temporary Help Se...|ET21DDA63780A7DC09| Oracle Consultants|oracle consultant...|[\n  "KS122626T55...|[\n  "Procurement...|[\n  "KS122626T55...|   [\n  "Procurement...|                  []|                  []|                  []|                  []|[\n  "BGSBF3F508F...|[\n  "Oracle Busi...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|                  []|                  []|                  []|                  []|                  []|                  []|   15-0000|Computer and Math...|   15-2000|Mathematical Scie...|   15-2050|Data Scientists|   15-2051|Data Scientists|             23|Information Techn...|        231010|Business Intellig...|                  23101012|           Oracle Consultant...|                2310|     Business Intellig...|                     23101012|              Oracle Consultant...|           231010|  Business Intellig...|                   2310|        Business Intellig...|                23|   Information Techn...|15-0000|Computer and 
Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists|             NULL|                  NULL|          56|Administrative an...|         561|Administrative an...|        5613| Employment Services|       56132|Temporary Help Se...|      561320|Temporary Help Se...|
|85318b12b3331fa49...|         9/6/2024|  2024-09-06 20:32:...|         1|6/2/2024| 7/7/2024|      35| [\n  "Job Board"\n]|[\n  "dejobs.org"\n]|[\n  "https://dej...|         []|               NULL|        Data Analyst|Taking care of pe...|      6/10/2024|               8|39063746|            Sedgwick|   Sedgwick|              false|       [\n  2\n]| [\n  "Bachelor's ...|            2|  Bachelor's degree|         NULL|              NULL|              1|Full-time (> 32 h...|                   5|                NULL|        false|  NULL|          0|          [None]|               NULL|     NULL|       NULL|{\n  "lat": 32.77...|    RGFsbGFzLCBUWA==|   Dallas, TX| 48113|    Dallas, TX|19100|Dallas-Fort Worth...|   48|     Texas|          48113|          Dallas, TX|          48113|          Dallas, TX|       19100|Dallas-Fort Worth...|       19100|Dallas-Fort Worth...|    52|Finance and Insur...|   524|Insurance Carrier...|  5242|Agencies, Brokera...| 52429|Other Insurance R...|524291|    Claims Adjusting|ET3037E0C947A02404|      Data Analysts|        data analyst|[\n  "KS1218W78FG...|[\n  "Management"...|[\n  "ESF3939CE1F...|   [\n  "Exception R...|[\n  "KS683TN76T7...|[\n  "Security Cl...|[\n  "KS1218W78FG...|[\n  "Management"...|[\n  "KS126HY6YLT...|[\n  "Microsoft O...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|                  []|                  []|                  []|                  []|                  []|                  []|   15-0000|Computer and Math...|   15-2000|Mathematical Scie...|   15-2050|Data Scientists|   15-2051|Data Scientists|             23|Information Techn...|        231113|Data / Data Minin...|                  23111310|                   Data Analyst|                2311|     Data Analysis and...|                     23111310|                      Data Analyst|           231113|  Data / Data Minin...|                   2311|        Data Analysis and...|                23|   Information Techn...|15-0000|Computer and 
Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists|             NULL|                  NULL|          52|Finance and Insur...|         524|Insurance Carrier...|        5242|Agencies, Brokera...|       52429|Other Insurance R...|      524291|    Claims Adjusting|
|1b5c3941e54a1889e...|         9/6/2024|  2024-09-06 20:32:...|         1|6/2/2024|7/20/2024|      48| [\n  "Job Board"\n]|[\n  "disabledper...|[\n  "https://www...|         []|               NULL|Sr. Lead Data Mgm...|About this role:\...|      6/12/2024|              10|37615159|         Wells Fargo|Wells Fargo|              false|      [\n  99\n]| [\n  "No Educatio...|           99|No Education Listed|         NULL|              NULL|              1|Full-time (> 32 h...|                   3|                NULL|        false|  NULL|          0|          [None]|               NULL|     NULL|       NULL|{\n  "lat": 33.44...|    UGhvZW5peCwgQVo=|  Phoenix, AZ|  4013|  Maricopa, AZ|38060|Phoenix-Mesa-Chan...|    4|   Arizona|           4013|        Maricopa, AZ|           4013|        Maricopa, AZ|       38060|Phoenix-Mesa-Chan...|       38060|Phoenix-Mesa-Chan...|    52|Finance and Insur...|   522|Credit Intermedia...|  5221|Depository Credit...| 52211|  Commercial Banking|522110|  Commercial Banking|ET2114E0404BA30075|Management Analysts|sr lead data mgmt...|[\n  "KS123QX62QY...|[\n  "Exit Strate...|[\n  "KS123QX62QY...|   [\n  "Exit Strate...|                  []|                  []|[\n  "KS7G6NP6R6L...|[\n  "Reliability...|[\n  "KS4409D76NW...|[\n  "SAS (Softwa...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|                  []|                  []|                  []|                  []|                  []|                  []|   15-0000|Computer and Math...|   15-2000|Mathematical Scie...|   15-2050|Data Scientists|   15-2051|Data Scientists|             23|Information Techn...|        231113|Data / Data Minin...|                  23111310|                   Data Analyst|                2311|     Data Analysis and...|                     23111310|                      Data Analyst|           231113|  Data / Data Minin...|                   2311|        Data Analysis and...|                23|   Information Techn...|15-0000|Computer and 
Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists|        [\n  6\n]|  [\n  "Data Privac...|          52|Finance and Insur...|         522|Credit Intermedia...|        5221|Depository Credit...|       52211|  Commercial Banking|      522110|  Commercial Banking|
|cb5ca25f02bdf25c1...|        6/19/2024|   2024-06-19 07:00:00|         0|6/2/2024|6/17/2024|      15|[\n  "FreeJobBoar...|[\n  "craigslist....|[\n  "https://mod...|         []|               NULL|Comisiones de $10...|Comisiones de $10...|      6/17/2024|              15|       0|        Unclassified|      LH/GM|              false|      [\n  99\n]| [\n  "No Educatio...|           99|No Education Listed|         NULL|              NULL|              3|Part-time / full-...|                NULL|                NULL|        false| 92500|          0|          [None]|               year|   150000|      35000|{\n  "lat": 37.63...|    TW9kZXN0bywgQ0E=|  Modesto, CA|  6099|Stanislaus, CA|33700|         Modesto, CA|    6|California|           6099|      Stanislaus, CA|           6099|      Stanislaus, CA|       33700|         Modesto, CA|       33700|         Modesto, CA|    99|Unclassified Indu...|   999|Unclassified Indu...|  9999|Unclassified Indu...| 99999|Unclassified Indu...|999999|Unclassified Indu...|ET0000000000000000|       Unclassified|comisiones de por...|                  []|                  []|                  []|                     []|                  []|                  []|                  []|                  []|                  []|                  []|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|                  []|                  []|                  []|                  []|                  []|                  []|   15-0000|Computer and Math...|   15-2000|Mathematical Scie...|   15-2050|Data Scientists|   15-2051|Data Scientists|             23|Information Techn...|        231010|Business Intellig...|                  23101012|           Oracle Consultant...|                2310|     Business Intellig...|                     23101012|              Oracle Consultant...|           231010|  Business Intellig...|                   2310|        Business Intellig...|                23|   Information Techn...|15-0000|Computer and 
Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists|             NULL|                  NULL|          99|Unclassified Indu...|         999|Unclassified Indu...|        9999|Unclassified Indu...|       99999|Unclassified Indu...|      999999|Unclassified Indu...|
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+---------------
-----+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
only showing top 5 rows

Spark read the data successfully: the total record count (72,498) and a preview of the first five rows are shown above. The file contains many columns, which can be used for visualization and analysis later.

import plotly.graph_objects as go
import plotly.io as pio

# Register a custom Plotly template under the name "nike".
# - layout: title and body fonts, a two-colour colorway, unified x hover.
# - data.bar: default label styling for bar traces (the value formatted with
#   the "$.2s" d3 format, placed outside the bar, white 20-pt text).
pio.templates["nike"] = go.layout.Template(
    layout=dict(
        title=dict(
            font=dict(
                family='HelveticaNeue-CondensedBold, Helvetica, Sans-serif',
                size=30,
                color='#333',
            ),
        ),
        font=dict(
            family='Helvetica Neue, Helvetica, Sans-serif',
            size=16,
            color='#333',
        ),
        colorway=['#ec7424', '#a4abab'],
        hovermode='x unified',
    ),
    data=dict(
        bar=[
            go.Bar(
                texttemplate='%{value:$.2s}',
                textposition='outside',
                textfont=dict(
                    family='Helvetica Neue, Helvetica, Sans-serif',
                    size=20,
                    color='#FFFFFF',
                ),
            ),
        ],
    ),
)
import os

# The figures below are exported under _output/; create the folder up front
# (exist_ok=True makes re-running this cell harmless).
os.makedirs(name="_output", exist_ok=True)
!pip install nbformat ipython
Requirement already satisfied: nbformat in ./env/lib/python3.12/site-packages (5.10.4)
Requirement already satisfied: ipython in ./env/lib/python3.12/site-packages (9.0.2)
Requirement already satisfied: fastjsonschema>=2.15 in ./env/lib/python3.12/site-packages (from nbformat) (2.21.1)
Requirement already satisfied: jsonschema>=2.6 in ./env/lib/python3.12/site-packages (from nbformat) (4.23.0)
Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in ./env/lib/python3.12/site-packages (from nbformat) (5.7.2)
Requirement already satisfied: traitlets>=5.1 in ./env/lib/python3.12/site-packages (from nbformat) (5.14.3)
Requirement already satisfied: decorator in ./env/lib/python3.12/site-packages (from ipython) (5.2.1)
Requirement already satisfied: ipython-pygments-lexers in ./env/lib/python3.12/site-packages (from ipython) (1.1.1)
Requirement already satisfied: jedi>=0.16 in ./env/lib/python3.12/site-packages (from ipython) (0.19.2)
Requirement already satisfied: matplotlib-inline in ./env/lib/python3.12/site-packages (from ipython) (0.1.7)
Requirement already satisfied: pexpect>4.3 in ./env/lib/python3.12/site-packages (from ipython) (4.9.0)
Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in ./env/lib/python3.12/site-packages (from ipython) (3.0.50)
Requirement already satisfied: pygments>=2.4.0 in ./env/lib/python3.12/site-packages (from ipython) (2.19.1)
Requirement already satisfied: stack_data in ./env/lib/python3.12/site-packages (from ipython) (0.6.3)
Requirement already satisfied: parso<0.9.0,>=0.8.4 in ./env/lib/python3.12/site-packages (from jedi>=0.16->ipython) (0.8.4)
Requirement already satisfied: attrs>=22.2.0 in ./env/lib/python3.12/site-packages (from jsonschema>=2.6->nbformat) (25.3.0)
Requirement already satisfied: jsonschema-specifications>=2023.03.6 in ./env/lib/python3.12/site-packages (from jsonschema>=2.6->nbformat) (2024.10.1)
Requirement already satisfied: referencing>=0.28.4 in ./env/lib/python3.12/site-packages (from jsonschema>=2.6->nbformat) (0.36.2)
Requirement already satisfied: rpds-py>=0.7.1 in ./env/lib/python3.12/site-packages (from jsonschema>=2.6->nbformat) (0.23.1)
Requirement already satisfied: platformdirs>=2.5 in ./env/lib/python3.12/site-packages (from jupyter-core!=5.0.*,>=4.12->nbformat) (4.3.7)
Requirement already satisfied: ptyprocess>=0.5 in ./env/lib/python3.12/site-packages (from pexpect>4.3->ipython) (0.7.0)
Requirement already satisfied: wcwidth in ./env/lib/python3.12/site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython) (0.2.13)
Requirement already satisfied: executing>=1.2.0 in ./env/lib/python3.12/site-packages (from stack_data->ipython) (2.2.0)
Requirement already satisfied: asttokens>=2.1.0 in ./env/lib/python3.12/site-packages (from stack_data->ipython) (3.0.0)
Requirement already satisfied: pure-eval in ./env/lib/python3.12/site-packages (from stack_data->ipython) (0.2.3)
Requirement already satisfied: typing-extensions>=4.4.0 in ./env/lib/python3.12/site-packages (from referencing>=0.28.4->jsonschema>=2.6->nbformat) (4.12.2)
!pip install notebook
Requirement already satisfied: notebook in ./env/lib/python3.12/site-packages (7.3.3)
Requirement already satisfied: jupyter-server<3,>=2.4.0 in ./env/lib/python3.12/site-packages (from notebook) (2.15.0)
Requirement already satisfied: jupyterlab-server<3,>=2.27.1 in ./env/lib/python3.12/site-packages (from notebook) (2.27.3)
Requirement already satisfied: jupyterlab<4.4,>=4.3.6 in ./env/lib/python3.12/site-packages (from notebook) (4.3.6)
Requirement already satisfied: notebook-shim<0.3,>=0.2 in ./env/lib/python3.12/site-packages (from notebook) (0.2.4)
Requirement already satisfied: tornado>=6.2.0 in ./env/lib/python3.12/site-packages (from notebook) (6.4.2)
Requirement already satisfied: anyio>=3.1.0 in ./env/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->notebook) (4.9.0)
Requirement already satisfied: argon2-cffi>=21.1 in ./env/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->notebook) (23.1.0)
Requirement already satisfied: jinja2>=3.0.3 in ./env/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->notebook) (3.1.6)
Requirement already satisfied: jupyter-client>=7.4.4 in ./env/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->notebook) (8.6.3)
Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in ./env/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->notebook) (5.7.2)
Requirement already satisfied: jupyter-events>=0.11.0 in ./env/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->notebook) (0.12.0)
Requirement already satisfied: jupyter-server-terminals>=0.4.4 in ./env/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->notebook) (0.5.3)
Requirement already satisfied: nbconvert>=6.4.4 in ./env/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->notebook) (7.16.6)
Requirement already satisfied: nbformat>=5.3.0 in ./env/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->notebook) (5.10.4)
Requirement already satisfied: overrides>=5.0 in ./env/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->notebook) (7.7.0)
Requirement already satisfied: packaging>=22.0 in ./env/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->notebook) (24.2)
Requirement already satisfied: prometheus-client>=0.9 in ./env/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->notebook) (0.21.1)
Requirement already satisfied: pyzmq>=24 in ./env/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->notebook) (26.3.0)
Requirement already satisfied: send2trash>=1.8.2 in ./env/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->notebook) (1.8.3)
Requirement already satisfied: terminado>=0.8.3 in ./env/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->notebook) (0.18.1)
Requirement already satisfied: traitlets>=5.6.0 in ./env/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->notebook) (5.14.3)
Requirement already satisfied: websocket-client>=1.7 in ./env/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->notebook) (1.8.0)
Requirement already satisfied: async-lru>=1.0.0 in ./env/lib/python3.12/site-packages (from jupyterlab<4.4,>=4.3.6->notebook) (2.0.5)
Requirement already satisfied: httpx>=0.25.0 in ./env/lib/python3.12/site-packages (from jupyterlab<4.4,>=4.3.6->notebook) (0.28.1)
Requirement already satisfied: ipykernel>=6.5.0 in ./env/lib/python3.12/site-packages (from jupyterlab<4.4,>=4.3.6->notebook) (6.29.5)
Requirement already satisfied: jupyter-lsp>=2.0.0 in ./env/lib/python3.12/site-packages (from jupyterlab<4.4,>=4.3.6->notebook) (2.2.5)
Requirement already satisfied: setuptools>=40.8.0 in ./env/lib/python3.12/site-packages (from jupyterlab<4.4,>=4.3.6->notebook) (78.0.2)
Requirement already satisfied: babel>=2.10 in ./env/lib/python3.12/site-packages (from jupyterlab-server<3,>=2.27.1->notebook) (2.17.0)
Requirement already satisfied: json5>=0.9.0 in ./env/lib/python3.12/site-packages (from jupyterlab-server<3,>=2.27.1->notebook) (0.10.0)
Requirement already satisfied: jsonschema>=4.18.0 in ./env/lib/python3.12/site-packages (from jupyterlab-server<3,>=2.27.1->notebook) (4.23.0)
Requirement already satisfied: requests>=2.31 in ./env/lib/python3.12/site-packages (from jupyterlab-server<3,>=2.27.1->notebook) (2.32.3)
Requirement already satisfied: idna>=2.8 in ./env/lib/python3.12/site-packages (from anyio>=3.1.0->jupyter-server<3,>=2.4.0->notebook) (3.10)
Requirement already satisfied: sniffio>=1.1 in ./env/lib/python3.12/site-packages (from anyio>=3.1.0->jupyter-server<3,>=2.4.0->notebook) (1.3.1)
Requirement already satisfied: typing_extensions>=4.5 in ./env/lib/python3.12/site-packages (from anyio>=3.1.0->jupyter-server<3,>=2.4.0->notebook) (4.12.2)
Requirement already satisfied: argon2-cffi-bindings in ./env/lib/python3.12/site-packages (from argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->notebook) (21.2.0)
Requirement already satisfied: certifi in ./env/lib/python3.12/site-packages (from httpx>=0.25.0->jupyterlab<4.4,>=4.3.6->notebook) (2025.1.31)
Requirement already satisfied: httpcore==1.* in ./env/lib/python3.12/site-packages (from httpx>=0.25.0->jupyterlab<4.4,>=4.3.6->notebook) (1.0.7)
Requirement already satisfied: h11<0.15,>=0.13 in ./env/lib/python3.12/site-packages (from httpcore==1.*->httpx>=0.25.0->jupyterlab<4.4,>=4.3.6->notebook) (0.14.0)
Requirement already satisfied: comm>=0.1.1 in ./env/lib/python3.12/site-packages (from ipykernel>=6.5.0->jupyterlab<4.4,>=4.3.6->notebook) (0.2.2)
Requirement already satisfied: debugpy>=1.6.5 in ./env/lib/python3.12/site-packages (from ipykernel>=6.5.0->jupyterlab<4.4,>=4.3.6->notebook) (1.8.13)
Requirement already satisfied: ipython>=7.23.1 in ./env/lib/python3.12/site-packages (from ipykernel>=6.5.0->jupyterlab<4.4,>=4.3.6->notebook) (9.0.2)
Requirement already satisfied: matplotlib-inline>=0.1 in ./env/lib/python3.12/site-packages (from ipykernel>=6.5.0->jupyterlab<4.4,>=4.3.6->notebook) (0.1.7)
Requirement already satisfied: nest-asyncio in ./env/lib/python3.12/site-packages (from ipykernel>=6.5.0->jupyterlab<4.4,>=4.3.6->notebook) (1.6.0)
Requirement already satisfied: psutil in ./env/lib/python3.12/site-packages (from ipykernel>=6.5.0->jupyterlab<4.4,>=4.3.6->notebook) (7.0.0)
Requirement already satisfied: MarkupSafe>=2.0 in ./env/lib/python3.12/site-packages (from jinja2>=3.0.3->jupyter-server<3,>=2.4.0->notebook) (3.0.2)
Requirement already satisfied: attrs>=22.2.0 in ./env/lib/python3.12/site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.27.1->notebook) (25.3.0)
Requirement already satisfied: jsonschema-specifications>=2023.03.6 in ./env/lib/python3.12/site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.27.1->notebook) (2024.10.1)
Requirement already satisfied: referencing>=0.28.4 in ./env/lib/python3.12/site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.27.1->notebook) (0.36.2)
Requirement already satisfied: rpds-py>=0.7.1 in ./env/lib/python3.12/site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.27.1->notebook) (0.23.1)
Requirement already satisfied: python-dateutil>=2.8.2 in ./env/lib/python3.12/site-packages (from jupyter-client>=7.4.4->jupyter-server<3,>=2.4.0->notebook) (2.9.0.post0)
Requirement already satisfied: platformdirs>=2.5 in ./env/lib/python3.12/site-packages (from jupyter-core!=5.0.*,>=4.12->jupyter-server<3,>=2.4.0->notebook) (4.3.7)
Requirement already satisfied: python-json-logger>=2.0.4 in ./env/lib/python3.12/site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->notebook) (3.3.0)
Requirement already satisfied: pyyaml>=5.3 in ./env/lib/python3.12/site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->notebook) (6.0.2)
Requirement already satisfied: rfc3339-validator in ./env/lib/python3.12/site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->notebook) (0.1.4)
Requirement already satisfied: rfc3986-validator>=0.1.1 in ./env/lib/python3.12/site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->notebook) (0.1.1)
Requirement already satisfied: beautifulsoup4 in ./env/lib/python3.12/site-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook) (4.13.3)
Requirement already satisfied: bleach!=5.0.0 in ./env/lib/python3.12/site-packages (from bleach[css]!=5.0.0->nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook) (6.2.0)
Requirement already satisfied: defusedxml in ./env/lib/python3.12/site-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook) (0.7.1)
Requirement already satisfied: jupyterlab-pygments in ./env/lib/python3.12/site-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook) (0.3.0)
Requirement already satisfied: mistune<4,>=2.0.3 in ./env/lib/python3.12/site-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook) (3.1.3)
Requirement already satisfied: nbclient>=0.5.0 in ./env/lib/python3.12/site-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook) (0.10.2)
Requirement already satisfied: pandocfilters>=1.4.1 in ./env/lib/python3.12/site-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook) (1.5.1)
Requirement already satisfied: pygments>=2.4.1 in ./env/lib/python3.12/site-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook) (2.19.1)
Requirement already satisfied: fastjsonschema>=2.15 in ./env/lib/python3.12/site-packages (from nbformat>=5.3.0->jupyter-server<3,>=2.4.0->notebook) (2.21.1)
Requirement already satisfied: charset-normalizer<4,>=2 in ./env/lib/python3.12/site-packages (from requests>=2.31->jupyterlab-server<3,>=2.27.1->notebook) (3.4.1)
Requirement already satisfied: urllib3<3,>=1.21.1 in ./env/lib/python3.12/site-packages (from requests>=2.31->jupyterlab-server<3,>=2.27.1->notebook) (2.3.0)
Requirement already satisfied: ptyprocess in ./env/lib/python3.12/site-packages (from terminado>=0.8.3->jupyter-server<3,>=2.4.0->notebook) (0.7.0)
Requirement already satisfied: webencodings in ./env/lib/python3.12/site-packages (from bleach!=5.0.0->bleach[css]!=5.0.0->nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook) (0.5.1)
Requirement already satisfied: tinycss2<1.5,>=1.1.0 in ./env/lib/python3.12/site-packages (from bleach[css]!=5.0.0->nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook) (1.4.0)
Requirement already satisfied: decorator in ./env/lib/python3.12/site-packages (from ipython>=7.23.1->ipykernel>=6.5.0->jupyterlab<4.4,>=4.3.6->notebook) (5.2.1)
Requirement already satisfied: ipython-pygments-lexers in ./env/lib/python3.12/site-packages (from ipython>=7.23.1->ipykernel>=6.5.0->jupyterlab<4.4,>=4.3.6->notebook) (1.1.1)
Requirement already satisfied: jedi>=0.16 in ./env/lib/python3.12/site-packages (from ipython>=7.23.1->ipykernel>=6.5.0->jupyterlab<4.4,>=4.3.6->notebook) (0.19.2)
Requirement already satisfied: pexpect>4.3 in ./env/lib/python3.12/site-packages (from ipython>=7.23.1->ipykernel>=6.5.0->jupyterlab<4.4,>=4.3.6->notebook) (4.9.0)
Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in ./env/lib/python3.12/site-packages (from ipython>=7.23.1->ipykernel>=6.5.0->jupyterlab<4.4,>=4.3.6->notebook) (3.0.50)
Requirement already satisfied: stack_data in ./env/lib/python3.12/site-packages (from ipython>=7.23.1->ipykernel>=6.5.0->jupyterlab<4.4,>=4.3.6->notebook) (0.6.3)
Requirement already satisfied: fqdn in ./env/lib/python3.12/site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->notebook) (1.5.1)
Requirement already satisfied: isoduration in ./env/lib/python3.12/site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->notebook) (20.11.0)
Requirement already satisfied: jsonpointer>1.13 in ./env/lib/python3.12/site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->notebook) (3.0.0)
Requirement already satisfied: uri-template in ./env/lib/python3.12/site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->notebook) (1.3.0)
Requirement already satisfied: webcolors>=24.6.0 in ./env/lib/python3.12/site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->notebook) (24.11.1)
Requirement already satisfied: six>=1.5 in ./env/lib/python3.12/site-packages (from python-dateutil>=2.8.2->jupyter-client>=7.4.4->jupyter-server<3,>=2.4.0->notebook) (1.17.0)
Requirement already satisfied: cffi>=1.0.1 in ./env/lib/python3.12/site-packages (from argon2-cffi-bindings->argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->notebook) (1.17.1)
Requirement already satisfied: soupsieve>1.2 in ./env/lib/python3.12/site-packages (from beautifulsoup4->nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook) (2.6)
Requirement already satisfied: pycparser in ./env/lib/python3.12/site-packages (from cffi>=1.0.1->argon2-cffi-bindings->argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->notebook) (2.22)
Requirement already satisfied: parso<0.9.0,>=0.8.4 in ./env/lib/python3.12/site-packages (from jedi>=0.16->ipython>=7.23.1->ipykernel>=6.5.0->jupyterlab<4.4,>=4.3.6->notebook) (0.8.4)
Requirement already satisfied: wcwidth in ./env/lib/python3.12/site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=7.23.1->ipykernel>=6.5.0->jupyterlab<4.4,>=4.3.6->notebook) (0.2.13)
Requirement already satisfied: arrow>=0.15.0 in ./env/lib/python3.12/site-packages (from isoduration->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->notebook) (1.3.0)
Requirement already satisfied: executing>=1.2.0 in ./env/lib/python3.12/site-packages (from stack_data->ipython>=7.23.1->ipykernel>=6.5.0->jupyterlab<4.4,>=4.3.6->notebook) (2.2.0)
Requirement already satisfied: asttokens>=2.1.0 in ./env/lib/python3.12/site-packages (from stack_data->ipython>=7.23.1->ipykernel>=6.5.0->jupyterlab<4.4,>=4.3.6->notebook) (3.0.0)
Requirement already satisfied: pure-eval in ./env/lib/python3.12/site-packages (from stack_data->ipython>=7.23.1->ipykernel>=6.5.0->jupyterlab<4.4,>=4.3.6->notebook) (0.2.3)
Requirement already satisfied: types-python-dateutil>=2.8.10 in ./env/lib/python3.12/site-packages (from arrow>=0.15.0->isoduration->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->notebook) (2.9.0.20241206)
!pip install ipython nbformat
Requirement already satisfied: ipython in ./env/lib/python3.12/site-packages (9.0.2)
Requirement already satisfied: nbformat in ./env/lib/python3.12/site-packages (5.10.4)
Requirement already satisfied: decorator in ./env/lib/python3.12/site-packages (from ipython) (5.2.1)
Requirement already satisfied: ipython-pygments-lexers in ./env/lib/python3.12/site-packages (from ipython) (1.1.1)
Requirement already satisfied: jedi>=0.16 in ./env/lib/python3.12/site-packages (from ipython) (0.19.2)
Requirement already satisfied: matplotlib-inline in ./env/lib/python3.12/site-packages (from ipython) (0.1.7)
Requirement already satisfied: pexpect>4.3 in ./env/lib/python3.12/site-packages (from ipython) (4.9.0)
Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in ./env/lib/python3.12/site-packages (from ipython) (3.0.50)
Requirement already satisfied: pygments>=2.4.0 in ./env/lib/python3.12/site-packages (from ipython) (2.19.1)
Requirement already satisfied: stack_data in ./env/lib/python3.12/site-packages (from ipython) (0.6.3)
Requirement already satisfied: traitlets>=5.13.0 in ./env/lib/python3.12/site-packages (from ipython) (5.14.3)
Requirement already satisfied: fastjsonschema>=2.15 in ./env/lib/python3.12/site-packages (from nbformat) (2.21.1)
Requirement already satisfied: jsonschema>=2.6 in ./env/lib/python3.12/site-packages (from nbformat) (4.23.0)
Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in ./env/lib/python3.12/site-packages (from nbformat) (5.7.2)
Requirement already satisfied: parso<0.9.0,>=0.8.4 in ./env/lib/python3.12/site-packages (from jedi>=0.16->ipython) (0.8.4)
Requirement already satisfied: attrs>=22.2.0 in ./env/lib/python3.12/site-packages (from jsonschema>=2.6->nbformat) (25.3.0)
Requirement already satisfied: jsonschema-specifications>=2023.03.6 in ./env/lib/python3.12/site-packages (from jsonschema>=2.6->nbformat) (2024.10.1)
Requirement already satisfied: referencing>=0.28.4 in ./env/lib/python3.12/site-packages (from jsonschema>=2.6->nbformat) (0.36.2)
Requirement already satisfied: rpds-py>=0.7.1 in ./env/lib/python3.12/site-packages (from jsonschema>=2.6->nbformat) (0.23.1)
Requirement already satisfied: platformdirs>=2.5 in ./env/lib/python3.12/site-packages (from jupyter-core!=5.0.*,>=4.12->nbformat) (4.3.7)
Requirement already satisfied: ptyprocess>=0.5 in ./env/lib/python3.12/site-packages (from pexpect>4.3->ipython) (0.7.0)
Requirement already satisfied: wcwidth in ./env/lib/python3.12/site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython) (0.2.13)
Requirement already satisfied: executing>=1.2.0 in ./env/lib/python3.12/site-packages (from stack_data->ipython) (2.2.0)
Requirement already satisfied: asttokens>=2.1.0 in ./env/lib/python3.12/site-packages (from stack_data->ipython) (3.0.0)
Requirement already satisfied: pure-eval in ./env/lib/python3.12/site-packages (from stack_data->ipython) (0.2.3)
Requirement already satisfied: typing-extensions>=4.4.0 in ./env/lib/python3.12/site-packages (from referencing>=0.28.4->jsonschema>=2.6->nbformat) (4.12.2)
from pyspark.sql.functions import col
# Box plot of starting salary per employment type.
#
# Keep postings with a positive starting salary and a known employment type.
# NOTE(review): the printed schema lists numeric-looking columns as string, so
# SALARY_FROM is cast to double explicitly here -- confirm its inferred type.
filtered_emp = df.filter(
    (col("SALARY_FROM").cast("double") > 0) &
    (col("EMPLOYMENT_TYPE_NAME").isNotNull())
)

# Only two columns are needed for the chart; the salary is collected as a
# numeric column so the y-axis is continuous rather than text.
pdf_emp = filtered_emp.select(
    "EMPLOYMENT_TYPE_NAME",
    col("SALARY_FROM").cast("double").alias("SALARY_FROM"),
).toPandas()

fig = px.box(
    pdf_emp,
    x="EMPLOYMENT_TYPE_NAME",
    y="SALARY_FROM",
    title="Salary Distribution by Employment Type"
)
fig.update_layout(template="nike")
fig.write_image("_output/salary_by_employment_type.svg")
fig.show()
                                                                                

Salary Distribution

Salary distributions differ significantly across employment types, and the presence of extremely high-paying jobs in some types stretches those distributions upward. Most jobs pay in the lower range.

# Box plot of starting salary per industry (NAICS2_NAME).
#
# Keep postings with a positive starting salary and a known industry.
# NOTE(review): the printed schema lists numeric-looking columns as string, so
# SALARY_FROM is cast to double explicitly here -- confirm its inferred type.
filtered_ind = df.filter(
    (col("SALARY_FROM").cast("double") > 0) &
    (col("NAICS2_NAME").isNotNull())
)

# Collect only the plotted columns, with the salary as a numeric column so the
# y-axis is continuous rather than text.
pdf_ind = filtered_ind.select(
    "NAICS2_NAME",
    col("SALARY_FROM").cast("double").alias("SALARY_FROM"),
).toPandas()

fig = px.box(
    pdf_ind,
    x="NAICS2_NAME",
    y="SALARY_FROM",
    title="Salary Distribution by Industry (NAICS2)"
)
fig.update_layout(template="nike")
fig.write_image("_output/salary_by_industry.svg")
fig.show()
                                                                                

Salary Distribution by Industry

As can be seen from the box plot, there are obvious differences in salary distribution among different industries (NAICS2_NAME), and the median salary level of some industries is significantly higher than that of other industries.

from pyspark.sql.functions import count
from pyspark.sql.functions import to_date, col

# Line chart of the number of postings per day.
#
# Parse the raw POSTED text into a date so that days aggregate and sort
# chronologically instead of as strings.
# NOTE(review): the "yyyy/M/d" pattern is kept from the original cell --
# confirm it matches the values in POSTED.
df = df.withColumn("POSTED_DATE", to_date(col("POSTED"), "yyyy/M/d"))

# Count postings per parsed day; rows whose date is null (missing or not
# parseable) are dropped because they cannot be placed on a time axis.
df_time = (
    df.filter(col("POSTED_DATE").isNotNull())
    .groupBy("POSTED_DATE")
    .agg(count("*").alias("job_count"))
)

# Spark does not guarantee group order, so sort by date after collecting.
pdf_time = df_time.toPandas().sort_values("POSTED_DATE")


fig = px.line(
    pdf_time,
    x="POSTED_DATE",
    y="job_count",
    title="Job Posting Trends Over Time"
)

fig.update_layout(template="nike")

fig.write_image("_output/job_posting_trends_over_time.svg")
fig.show()
                                                                                

Job Posting Trends Over Time

As can be seen from the line chart, the number of job postings fluctuates from day to day.

from pyspark.sql.functions import count, desc

# Bar chart of the ten most frequent job titles.

# Number of postings per TITLE_NAME.
df_titles = (
    df.groupBy("TITLE_NAME")
    .agg(count("*").alias("job_count"))
)

# Rank by frequency in Spark and bring only the ten largest groups to pandas.
top_10_titles = (
    df_titles
    .orderBy(desc("job_count"))
    .limit(10)
    .toPandas()
)

fig = px.bar(
    data_frame=top_10_titles,
    x="TITLE_NAME",
    y="job_count",
    title="Top 10 Job Titles by Count",
)

fig.update_layout(template="nike")
fig.write_image("_output/top10_titles.svg")
fig.show()
                                                                                

Top 10 Job Titles by Count

As can be seen from the bar chart, the number of “Data Analysts” positions is far ahead, reflecting the strong demand for data analysis positions in the current market. The next few titles, such as “Unclassified”, “Business Intelligence” and “Enterprise Architect”, also account for a high volume of postings.

from pyspark.sql.functions import count, col
import plotly.express as px

# Pie chart of postings per remote-work category.

# Number of postings for each REMOTE_TYPE_NAME value.
df_remote = (
    df.groupBy("REMOTE_TYPE_NAME")
    .agg(count("*").alias("job_count"))
)

# One row per category, so collecting to pandas is cheap.
pdf_remote = df_remote.toPandas()

fig = px.pie(
    data_frame=pdf_remote,
    names="REMOTE_TYPE_NAME",
    values="job_count",
    title="Remote vs On-Site Job Postings",
)

fig.update_layout(template="nike")

fig.write_image("_output/remote_vs_onsite.svg")
fig.show()
                                                                                

Remote vs On-Site Job Postings

As can be seen from the pie chart, the vast majority of jobs (about 78%) do not clearly label remote attributes, indicating that there are a lot of gaps in the data or undefined working patterns. The percentage of jobs that are truly labeled remote or hybrid remote is relatively small.

from pyspark.sql.functions import split, explode, col, count
import plotly.express as px

# Stacked bar chart of skill counts per industry.

# Keep postings that have both an industry and a skills value.
df_skills = df.filter(
    (col("NAICS2_NAME").isNotNull()) &
    (col("SKILLS").isNotNull())
)

# One output row per comma-separated piece of SKILLS.
# NOTE(review): pieces are used exactly as split (no trimming of whitespace or
# other surrounding characters) -- verify against the actual SKILLS format.
df_skills = df_skills.withColumn("skill_item", explode(split(col("SKILLS"), ",")))

# Count how often each skill item occurs within each industry.
df_skill_counts = df_skills.groupBy("NAICS2_NAME", "skill_item") \
                           .agg(count("*").alias("skill_count"))

pdf_skill_counts = df_skill_counts.toPandas()

fig = px.bar(
    pdf_skill_counts,
    x="NAICS2_NAME",
    y="skill_count",
    color="skill_item",
    # The title must be a single-line string literal; a line break inside the
    # quotes is a syntax error.
    title="Skill Demand Analysis by Industry (Stacked Bar Chart)",
    barmode="stack"
)

fig.update_layout(template="nike")
fig.write_image("_output/skill_demand_by_industry.svg")
fig.show()
                                                                                

Skill Demand Analysis by Industry

As can be seen from the stacked bar chart, the demand for different skills varies significantly by industry, and the total demand of some industries is much higher than that of others. Each color represents a unique skill ID, and the higher the stack, the greater the demand for multiple skills in the industry.

from pyspark.sql.functions import percentile_approx, count

# Bubble chart: approximate median starting salary per ONET occupation,
# with bubble size given by the number of postings.

# Keep postings with a known occupation and a positive starting salary.
df_onet = df.filter(
    col("ONET_NAME").isNotNull() & (col("SALARY_FROM") > 0)
)

# Per occupation: approximate median (50th percentile) salary and posting count.
df_onet_agg = (
    df_onet
    .groupBy("ONET_NAME")
    .agg(
        percentile_approx("SALARY_FROM", 0.5).alias("median_salary"),
        count("*").alias("posting_count"),
    )
)

pdf_onet = df_onet_agg.toPandas()

fig = px.scatter(
    data_frame=pdf_onet,
    x="ONET_NAME",
    y="median_salary",
    size="posting_count",
    hover_name="ONET_NAME",
    title="Salary Analysis by ONET Occupation Type (Bubble Chart)",
)

fig.update_layout(template="nike")
fig.write_image("_output/salary_by_onet.svg")
fig.show()
                                                                                

Salary Analysis by ONET Occupation Type

As you can see from the chart, the median salary for the ONET occupation type “Business Intelligence Analysts” is about 88K. Only one bubble appears in the figure, which suggests that either the dataset contains valid records for this ONET occupation only, or the other occupation types were removed by the filters.

from pyspark.sql.functions import count
import plotly.graph_objects as go

# Sankey diagram linking each SOC_2021_2_NAME group to its SOC_2021_3_NAME groups.

# Keep postings where both SOC levels are present.
df_sankey = df.filter(
    (df["SOC_2021_2_NAME"].isNotNull()) &
    (df["SOC_2021_3_NAME"].isNotNull())
)

# Link weight = number of postings for each (level-2, level-3) pair.
df_sankey_agg = df_sankey.groupBy("SOC_2021_2_NAME", "SOC_2021_3_NAME") \
                         .agg(count("*").alias("transition_count"))

pdf_sankey = df_sankey_agg.toPandas()

unique_2 = pdf_sankey["SOC_2021_2_NAME"].unique().tolist()
unique_3 = pdf_sankey["SOC_2021_3_NAME"].unique().tolist()
# De-duplicate while preserving first-seen order. A plain set() would order the
# nodes by string hash, which changes between interpreter runs and makes the
# node indices (and the exported SVG) non-reproducible.
all_nodes = list(dict.fromkeys(unique_2 + unique_3))

# Sankey links refer to nodes by their position in the label list.
node_map = {name: i for i, name in enumerate(all_nodes)}

pdf_sankey["source"] = pdf_sankey["SOC_2021_2_NAME"].map(node_map)
pdf_sankey["target"] = pdf_sankey["SOC_2021_3_NAME"].map(node_map)

fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=all_nodes
    ),
    link=dict(
        source=pdf_sankey["source"],
        target=pdf_sankey["target"],
        value=pdf_sankey["transition_count"]
    )
)])

fig.update_layout(
    title_text="Career Pathway Trends (Sankey Diagram)",
    template="nike"
)

fig.write_image("_output/career_pathway_sankey.svg")
fig.show()
                                                                                

Career Pathway Trends (Sankey Diagram)

This Sankey chart shows flows, or associations, from broader Computer and Mathematical Occupations to more specialized Mathematical Science Occupations.